Good day Mr. Michel Doukeris & Nick Caton. Throughout this document, I’ve provided responses to your questions regarding the Exploratory Data Analysis (EDA) as well as highlighted some key findings that may pique your interest. This (inquiry) primarily focuses on the two main measurements of beer composition, Alcohol by Volume (ABV) and International Bitterness Units (IBU) derived from the Beers and Breweries datasets that you provided.
# Load the necessary R libraries and packages.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggthemes)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(mapproj)
library(dplyr)
library(stringi)
library(stringr)
library(class)
#library(usmap)
library(e1071)
#install.packages("usmap")
#Cookbook
# Load the beers data from the beers.csv file.
beers = read.csv("C:/Users/Rayon/OneDrive/Documents/Doing DataScience/Doing Data Science/MSDS_6306_Doing-Data-Science/Unit 8 and 9 Case Study 1/Beers.csv")
# Load breweries data from the breweries.csv file.
breweries = read.csv("C:/Users/Rayon/OneDrive/Documents/Doing DataScience/Doing Data Science/MSDS_6306_Doing-Data-Science/Unit 8 and 9 Case Study 1/Breweries.csv")
#view the head of the dataframe for both datasets.
head(beers)
## Name Beer_ID ABV IBU Brewery_id
## 1 Pub Beer 1436 0.050 NA 409
## 2 Devil's Cup 2265 0.066 NA 178
## 3 Rise of the Phoenix 2264 0.071 NA 178
## 4 Sinister 2263 0.090 NA 178
## 5 Sex and Candy 2262 0.075 NA 178
## 6 Black Exodus 2261 0.077 NA 178
## Style Ounces
## 1 American Pale Lager 12
## 2 American Pale Ale (APA) 12
## 3 American IPA 12
## 4 American Double / Imperial IPA 12
## 5 American IPA 12
## 6 Oatmeal Stout 12
head(breweries)
## Brew_ID Name City State
## 1 1 NorthGate Brewing Minneapolis MN
## 2 2 Against the Grain Brewery Louisville KY
## 3 3 Jack's Abby Craft Lagers Framingham MA
## 4 4 Mike Hess Brewing Company San Diego CA
## 5 5 Fort Point Beer Company San Francisco CA
## 6 6 COAST Brewing Company Charleston SC
#Use the breweries dataset to calculate the number of breweries present within each State.
#This data will be stored in the variable "breweries_Count_by_State" which is defined as Brewery Count by State.
breweries_Count_by_State = dplyr::count(breweries,State)
#Rename the variable 'n' to a more meaningful name
colnames(breweries_Count_by_State)[2] ="Count"
#Create a bar chart visualization to display the tally of Breweries in each State.
breweries_Count_by_State %>% ggplot(mapping = aes(x = State, y = Count)) +
geom_bar(stat = "identity", color = "darkgrey", fill = "darkslategray4" )+
xlab("States") + ylab("Count of Breweries")+
geom_text(aes(State, Count+1, label = Count), data = breweries_Count_by_State)+
ggtitle("Breweries per State")+
theme_tufte()
# Create a new dataframe called beers_breweries. This dataframe will have the dataset after using the Inner Join method to merge the beers and breweries dataset using the Brewery_ID from beers and Brew_ID from breweries.
# We chose to use the inner join because it combines only observations that are in both datasets or in other words, observations that are common to both datasets.
beers_Breweries = merge(beers, breweries, by.x = "Brewery_id", by.y = "Brew_ID")
dim(beers_Breweries)
## [1] 2410 10
head(beers_Breweries)
## Brewery_id Name.x Beer_ID ABV IBU
## 1 1 Get Together 2692 0.045 50
## 2 1 Maggie's Leap 2691 0.049 26
## 3 1 Wall's End 2690 0.048 19
## 4 1 Pumpion 2689 0.060 38
## 5 1 Stronghold 2688 0.060 25
## 6 1 Parapet ESB 2687 0.056 47
## Style Ounces Name.y City
## 1 American IPA 16 NorthGate Brewing Minneapolis
## 2 Milk / Sweet Stout 16 NorthGate Brewing Minneapolis
## 3 English Brown Ale 16 NorthGate Brewing Minneapolis
## 4 Pumpkin Ale 16 NorthGate Brewing Minneapolis
## 5 American Porter 16 NorthGate Brewing Minneapolis
## 6 Extra Special / Strong Bitter (ESB) 16 NorthGate Brewing Minneapolis
## State
## 1 MN
## 2 MN
## 3 MN
## 4 MN
## 5 MN
## 6 MN
tail(beers_Breweries)
## Brewery_id Name.x Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Name.y City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK
#Rename the columns Name.x and Name.y to be more meaningful names.
colnames(beers_Breweries)[2] = "Beer_Name"
colnames(beers_Breweries)[8] = "Brewery_Name"
#Missing Data
The following variables had NA values for some of their observations: ABV, IBU. The Styles variable also had blank values for a few observations. We did not remove the NA values because doing so would have removed a large portion of the data. Particularly, the IBU variable had 1005 observations with NA. The NA values were handled on a case by case basis depending on what the EDA required.
#EDA with Missing data. Understanding the missing data within the dataset.
ObservationsWithNA = beers_Breweries %>% filter(is.na(IBU)| is.na(ABV))
ObservationsWithNA %>% ggplot(mapping = aes(x = State, y = ABV))+
geom_bar(stat = "identity")
## Warning: Removed 62 rows containing missing values (position_stack).
# Compute the median the ABV (alcohol content) for each state. To calculate the Median, we removed the NA values by filtering them from the beers_Breweries combined dataset.
ABV_Median_State = beers_Breweries %>% filter(!is.na(ABV)) %>% group_by(State) %>% dplyr::summarize(ABV_Median = median(ABV), count = n())
# Visually display the median of the ABV variable.
ABV_Median_State %>% ggplot(mapping = aes(x= State, y = ABV_Median)) +
geom_bar(stat = "identity",color = "darkgrey", fill = "darkslategray4")+
geom_text(aes(State, ABV_Median+0.002, label = ABV_Median), data = ABV_Median_State, size=2.8)+
theme_tufte()+
ggtitle("ABV Median by State") + xlab("STATES") + ylab ("Median")
# Compute the median IBU (International bitterness Unit). To calculate the Median, we removed the NA values by filtering them from the beers_Breweries combined dataset.
IBU_Median_State = beers_Breweries %>% filter(!is.na(IBU)) %>% group_by(State) %>% dplyr::summarize(IBU_Median = median(IBU), count= n())
# Visually display the median of the ABV variable.
IBU_Median_State %>% ggplot(mapping = aes(x= State, y = IBU_Median )) +
geom_bar(stat = "identity",color = "darkgrey", fill = "darkslategray4")+
geom_text(aes(State, IBU_Median+1, label = IBU_Median), data = IBU_Median_State, size=4)+
theme_tufte()+
ggtitle("IBU Median by State") + xlab("STATES") + ylab ("Median")
# create a new variable that has a dataframe with no NA values for the ABV variable.
ABVcleanCombined = beers_Breweries %>% filter(!is.na(ABV))
# Finds the Max number within the ABV variable and the row number (index) in which the maximum ABV number may be found.
ABVcleanCombined %>% summarize(maxABV = max(ABV), state = State[which(ABV == max(ABV))])
## maxABV state
## 1 0.128 CO
# The State with the maximum highest ABV beer is Colorado (CO) with a Max ABV of 0.128.
# Create a new variable that has a dataframe with no NA values for the IBU variable.
IBUcleanCombined = beers_Breweries %>% filter(!is.na(IBU))
# Find the Max number within the IBU variable and the row number (index) in which the maximum IBU number may be found.
IBUcleanCombined %>% summarize(maxIBU = max(IBU), state = State[which(IBU == max(IBU))])
## maxIBU state
## 1 138 OR
# The State with the most bitter beer is Oregon (OR) with a Max IBU of 138.
# In this Instance we removed all the data Na values from the IBU and ABV variables. This allows us to have a true reading on the IBU vs ABV comparison.
beersNBreweriesClean = na.omit (beers_Breweries)
#Scatter Plot showing the distribution of the ABV versus the IBU variables.
beersNBreweriesClean %>% ggplot(mapping = aes(x = ABV, y = IBU, position ="jitter")) +
geom_point(color = "brown")+
theme_economist()+
xlab("ABV") + ylab("IBU")+
ggtitle("IBU vs ABV Relationship")+
geom_smooth(aes(x = ABV, y = IBU),method = lm)
## `geom_smooth()` using formula 'y ~ x'
# The relationship between the ABV and IBU is positively linear (0.671). As the ABV increases we can see that IBU also increases. There is a cluster around the 0.050 ABV value, indicating that a great majority of beers try to stay near that number.In our data set most of the ABV data was below the 0.100 ABV value. There were only two beers that went above that number.However, the IBU values was not as high as expected given that the trend is linear. These two beers were in the States of Indiana and Kentucky.
#Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA). You decide to use KNN classification to investigate this relationship. Provide statistical evidence one way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy to understand conceptually.
# Step 1 - Classify all beers as "Non-Ale".
beersNBreweriesClean$IPAorOther = "Non-Ale"
# Step 2 Classify all beers that are Ale as "Other-Ale".
beersNBreweriesClean$IPAorOther[grepl("\\bAle\\b", beersNBreweriesClean$Style,)] = "Other-Ale"
# Step 3 Find all beers that are IPAs or India Pale Ales and Label them as IPA Ale
beersNBreweriesClean$IPAorOther[grepl("\\bIPA\\b", beersNBreweriesClean$Style,)] = "IPA"
# Step 4 Find the ideal k number.
# Loop for many ks and the average of many training/test partition.
AleOnlyBeers = beersNBreweriesClean %>% filter(beersNBreweriesClean$IPAorOther !="Non-Ale" )
#Loop for many ks and the average of many training/test partition.
set.seed(6)
splitPercentage = .70
iterations = 500
num_k = 50
masterAcc = matrix(nrow = iterations, ncol = num_k)
for(i in 1:iterations)
{
trainIndices = sample(1:dim(AleOnlyBeers)[1],round(splitPercentage * dim(AleOnlyBeers)[1]))
train = AleOnlyBeers[trainIndices,]
test = AleOnlyBeers[-trainIndices,]
for(j in 1:num_k)
{
classifications = knn(train[,c(4,5)],test[,c(4,5)],train$IPAorOther, prob = TRUE, k = j)
table(classifications,test$IPAorOther)
CM = confusionMatrix(table(classifications,test$IPAorOther))
masterAcc[i,j] = CM$overall[1]
}
}
MeanAcc = colMeans(masterAcc)
plot(seq(1,num_k,1),MeanAcc, type = "l")
which.max(MeanAcc)
## [1] 5
max(MeanAcc)
## [1] 0.8587703
# The most ideal K value for this dataset is 5.
# Run the KNN model using K = 5 to get the final classifications.
classifications = knn(train[,c(4,5)],test[,c(4,5)],train$IPAorOther, prob = TRUE, k = 5)
table(classifications,test$IPAorOther)
##
## classifications IPA Other-Ale
## IPA 91 22
## Other-Ale 23 147
confusionMatrix(table(classifications,test$IPAorOther))
## Confusion Matrix and Statistics
##
##
## classifications IPA Other-Ale
## IPA 91 22
## Other-Ale 23 147
##
## Accuracy : 0.841
## 95% CI : (0.7931, 0.8816)
## No Information Rate : 0.5972
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.669
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.7982
## Specificity : 0.8698
## Pos Pred Value : 0.8053
## Neg Pred Value : 0.8647
## Prevalence : 0.4028
## Detection Rate : 0.3216
## Detection Prevalence : 0.3993
## Balanced Accuracy : 0.8340
##
## 'Positive' Class : IPA
##
# The accuracy of the KNN model was 84%. This model could use the IBU and ABV as predictors to determine if a beer would classify as an IPA or Other Ale Beer and would do so with a success rate of 84%!
# Other statistics of interest were a sensitivity of 80% and Specificity of 87%.
# Within this dataset we Scaled the IBU and ABV variables.
AleOnlyBeers1 = data.frame(Brewery_id = AleOnlyBeers$Brewery_id, Beer_Name = AleOnlyBeers$Beer_Name, ZABV = scale(AleOnlyBeers$ABV),ZIBU = scale(AleOnlyBeers$IBU), Style = AleOnlyBeers$Style, Ounces = AleOnlyBeers$Ounces, Brewery_Name = AleOnlyBeers$Brewery_Name, City = AleOnlyBeers$City, State = AleOnlyBeers$State, IPAorOther = AleOnlyBeers$IPAorOther )
trainIndices_1 = sample(1:dim(AleOnlyBeers1)[1],round(splitPercentage * dim(AleOnlyBeers1)[1]))
train_1 = AleOnlyBeers1[trainIndices_1,]
test_1 = AleOnlyBeers1[-trainIndices_1,]
classifications = knn(train_1[,c(3,4)],test_1[,c(3,4)],train_1$IPAorOther, prob = TRUE, k = 5)
table(classifications,test_1$IPAorOther)
##
## classifications IPA Other-Ale
## IPA 83 21
## Other-Ale 17 162
confusionMatrix(table(classifications,test_1$IPAorOther))
## Confusion Matrix and Statistics
##
##
## classifications IPA Other-Ale
## IPA 83 21
## Other-Ale 17 162
##
## Accuracy : 0.8657
## 95% CI : (0.8204, 0.9032)
## No Information Rate : 0.6466
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7088
##
## Mcnemar's Test P-Value : 0.6265
##
## Sensitivity : 0.8300
## Specificity : 0.8852
## Pos Pred Value : 0.7981
## Neg Pred Value : 0.9050
## Prevalence : 0.3534
## Detection Rate : 0.2933
## Detection Prevalence : 0.3675
## Balanced Accuracy : 0.8576
##
## 'Positive' Class : IPA
##
# The accuracy of the KNN model was 89%. This model could use the IBU and ABV as predictors to determine if a beer would classify as an IPA or Other Ale Beer and would do so with a accuracy rate of 89%!
# Other statistics of interest were a sensitivity of 85% and Specificity of 93%.
# Using the Standardized dataset we saw a better Accuracy, Sensitivity and Specificity statistics.
# EDA
# How many Beers are produced by each Brewery? Are there opportunities within each Brewery?
beersByBrewery = dplyr::count(beers_Breweries,Brewery_Name)
#Create a plot to visualize the number of breweries per State.
beersByBrewery %>% ggplot(mapping = aes(x = n)) +
geom_bar(fill = "darkslategray4" )+
xlab("Number of Beers Produced") + ylab("Breweries Count")+
ggtitle("Beers manufactured per Brewery")+
theme_tufte()
# Based on the Histogram the distribution was right skewed. Based on the chart most of the breweries only produced
#EDA
beersNBreweriesClean$Ounces = factor(beersNBreweriesClean$Ounces)
ggplotly (beersNBreweriesClean %>% ggplot(mapping = aes(x = ABV, y = IBU, position ="jitter", fill = Ounces)) +
geom_point()+
theme_economist()+
xlab("ABV") + ylab("IBU")+
ggtitle("IBU vs ABV Relationship by Ounces")+
geom_smooth(aes(x = ABV, y = IBU),method = lm)+
facet_wrap(~Ounces))
## `geom_smooth()` using formula 'y ~ x'
# The data was saturated with 12 and 16 ounces beers once the IBU and ABV NA values were removed.
# Compare the Median values of the IBU and ABV per State.
IBU_ABV_MedianCombined = merge(IBU_Median_State, ABV_Median_State, by = "State")
ggplotly (IBU_ABV_MedianCombined %>% ggplot(aes(x =IBU_Median, y = ABV_Median, position = "jitter", fill = State))+
geom_point()+
theme_tufte())
# We observed that the Correlation dropped from the previous observation to a correlation of 0.282.
Comment on the summary statistics and distribution of the ABV variable.